{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import requests\n",
    "from lxml import etree\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4054, 6)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): absolute path inside one user's home directory; point it at your own copy of 000001.csv before running.\n",
    "df_path = r\"C:\\Users\\Olive\\python_projects\\FinRL-Meta\\FinGPT-ChatGLM-Fineturning_old\\Chinese_News\\scrape\\results\\000001.csv\"\n",
    "df = pd.read_csv(df_path)\n",
    "# Show (rows, columns) as a quick check that the file loaded.\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read amount</th>\n",
       "      <th>comments</th>\n",
       "      <th>title</th>\n",
       "      <th>content link</th>\n",
       "      <th>author</th>\n",
       "      <th>create time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>496</td>\n",
       "      <td>0</td>\n",
       "      <td>平安银行：融资净买入3199.17万元，融资余额39.29亿元</td>\n",
       "      <td>/news,000001,1294939786.html</td>\n",
       "      <td>平安银行资讯</td>\n",
       "      <td>04-07 08:46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5448</td>\n",
       "      <td>20</td>\n",
       "      <td>平安银行今日大宗交易折价成交198万股 成交额2257.2万</td>\n",
       "      <td>/news,000001,1294874925.html</td>\n",
       "      <td>平安银行资讯</td>\n",
       "      <td>04-06 21:54</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  read amount  comments                            title  \\\n",
       "0         496         0  平安银行：融资净买入3199.17万元，融资余额39.29亿元   \n",
       "1        5448        20   平安银行今日大宗交易折价成交198万股 成交额2257.2万   \n",
       "\n",
       "                   content link  author  create time  \n",
       "0  /news,000001,1294939786.html  平安银行资讯  04-07 08:46  \n",
       "1  /news,000001,1294874925.html  平安银行资讯  04-06 21:54  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first two rows to see which columns the CSV provides.\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://guba.eastmoney.com/news,000001,1294939786.html'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Site root; the \"content link\" column holds paths relative to it.\n",
    "link_base = \"https://guba.eastmoney.com\"\n",
    "# Select the first row by position: a label lookup with [0] only works while the index still starts at 0.\n",
    "url = link_base + df[\"content link\"].iloc[0]\n",
    "url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Headers sent with the page request: a desktop Firefox User-Agent plus a Referer on the same site.\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0\",\n",
    "    \"Referer\": \"https://guba.eastmoney.com/\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch the post page. The timeout (seconds) stops the cell from hanging forever if the server never answers.\n",
    "response = requests.get(url=url, headers=headers, timeout=10)\n",
    "# Display the HTTP status code to confirm the request succeeded.\n",
    "response.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>post_id</th>\n",
       "      <th>post_user</th>\n",
       "      <th>post_guba</th>\n",
       "      <th>post_title</th>\n",
       "      <th>post_content</th>\n",
       "      <th>post_abstract</th>\n",
       "      <th>post_publish_time</th>\n",
       "      <th>post_last_time</th>\n",
       "      <th>post_display_time</th>\n",
       "      <th>post_ip</th>\n",
       "      <th>...</th>\n",
       "      <th>post_add_time</th>\n",
       "      <th>post_modules</th>\n",
       "      <th>post_speccolumn</th>\n",
       "      <th>post_ip_address</th>\n",
       "      <th>source_post_ip_address</th>\n",
       "      <th>post_mod_time</th>\n",
       "      <th>post_mod_count</th>\n",
       "      <th>allow_likes_state</th>\n",
       "      <th>system_comment_authority</th>\n",
       "      <th>limit_reply_user_auth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1294939786</td>\n",
       "      <td>{'user_id': '6385113638007064', 'user_nickname...</td>\n",
       "      <td>{'stockbar_type': 2, 'stockbar_code': '000001'...</td>\n",
       "      <td>平安银行：融资净买入3199.17万元，融资余额39.29亿元（04-06）</td>\n",
       "      <td>&lt;div data-type=\"abstract\" data-abstract=\"2023年...</td>\n",
       "      <td>2023年4月6日平安银行融资净买入3199.17万元，融资余额39.29亿元</td>\n",
       "      <td>2023-04-07 08:46:55</td>\n",
       "      <td>2023-04-07 08:46:55</td>\n",
       "      <td>2023-04-07 08:46:55</td>\n",
       "      <td></td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>[]</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 85 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      post_id                                          post_user  \\\n",
       "0  1294939786  {'user_id': '6385113638007064', 'user_nickname...   \n",
       "\n",
       "                                           post_guba  \\\n",
       "0  {'stockbar_type': 2, 'stockbar_code': '000001'...   \n",
       "\n",
       "                               post_title  \\\n",
       "0  平安银行：融资净买入3199.17万元，融资余额39.29亿元（04-06）   \n",
       "\n",
       "                                        post_content  \\\n",
       "0  <div data-type=\"abstract\" data-abstract=\"2023年...   \n",
       "\n",
       "                             post_abstract    post_publish_time  \\\n",
       "0  2023年4月6日平安银行融资净买入3199.17万元，融资余额39.29亿元  2023-04-07 08:46:55   \n",
       "\n",
       "        post_last_time    post_display_time post_ip  ... post_add_time  \\\n",
       "0  2023-04-07 08:46:55  2023-04-07 08:46:55          ...          None   \n",
       "\n",
       "  post_modules post_speccolumn post_ip_address source_post_ip_address  \\\n",
       "0           []            None                                          \n",
       "\n",
       "  post_mod_time post_mod_count allow_likes_state system_comment_authority  \\\n",
       "0          None              0                 0                        0   \n",
       "\n",
       "  limit_reply_user_auth  \n",
       "0                     0  \n",
       "\n",
       "[1 rows x 85 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the page and take the text of the first <script> element that is the second script child of its parent.\n",
    "# NOTE(review): this position-based XPath depends on the current page layout - confirm if the site changes.\n",
    "html_tree = etree.HTML(response.text)\n",
    "script_text = html_tree.xpath(\"//script[2]//text()\")[0]\n",
    "# The original skipped a fixed 17-character prefix before the JSON payload; starting at the first\n",
    "# opening brace gives the same result without depending on the exact length of that prefix.\n",
    "post_data = json.loads(script_text[script_text.index(\"{\"):])\n",
    "# Lay the parsed object out as a one-row DataFrame with one column per key.\n",
    "res = pd.Series(post_data).to_frame().T\n",
    "res"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "finrl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
